library(tidyverse)
library(scales)

Fisheries of the world

The Fisheries and Aquaculture Department of the Food and Agriculture Organization of the United Nations collects data on the fisheries production of countries. The (not-so-great) visualization below shows the distribution of countries' fishery harvests for 2016, by capture and aquaculture.

  • Countries whose total harvest was less than 100,000 tons are not included in the visualization.
  • Source: Fishing industry by country
Question: What are some ways you would improve this visualization?

  • Calculate summary statistics at the continent level and visualize them.
  • Map the data.

The data

Let’s load the data:

# Read the country-level fisheries data from disk.
fisheries <- read_csv(file = "data/fisheries.csv")
## Parsed with column specification:
## cols(
##   country = col_character(),
##   capture = col_double(),
##   aquaculture = col_double(),
##   total = col_double()
## )
# List the column names of the data just read.
colnames(fisheries)
## [1] "country"     "capture"     "aquaculture" "total"

And inspect it:

# Print the tibble to see its dimensions, column types and first rows.
fisheries
## # A tibble: 216 x 4
##    country             capture aquaculture  total
##    <chr>                 <dbl>       <dbl>  <dbl>
##  1 Afghanistan            1000        1200   2200
##  2 Albania                7886         950   8836
##  3 Algeria               95000        1361  96361
##  4 American Samoa         3047          20   3067
##  5 Andorra                   0           0      0
##  6 Angola               486490         655 487145
##  7 Antigua and Barbuda    3000          10   3010
##  8 Argentina            755226        3673 758899
##  9 Armenia                3758       16381  20139
## 10 Aruba                   142           0    142
## # … with 206 more rows

Data prep

Filter out countries whose total harvest was less than 100,000 tons since they are not included in the visualization:

# Keep only the countries that appear in the original visualization: those
# with a total harvest of at least 100,000 tons. `>=` (rather than `>`) keeps
# a country sitting exactly on the threshold, because only totals *below*
# 100,000 are meant to be dropped.
fisheries <- fisheries %>%
  filter(total >= 100000)
fisheries
## # A tibble: 82 x 4
##    country    capture aquaculture   total
##    <chr>        <dbl>       <dbl>   <dbl>
##  1 Angola      486490         655  487145
##  2 Argentina   755226        3673  758899
##  3 Australia   174629       96847  271476
##  4 Bangladesh 1674770     2203554 3878324
##  5 Brazil      705000      581230 1286230
##  6 Cambodia    629950      172500  802450
##  7 Cameroon    233190        2315  235505
##  8 Canada      874727      200765 1075492
##  9 Chad        110000          94  110094
## 10 Chile      1829238     1050117 2879355
## # … with 72 more rows

Then, we will join this with the continent data.

# Read the country-to-continent lookup table from disk.
continents <- read_csv(file = "data/continents.csv")
## Parsed with column specification:
## cols(
##   country = col_character(),
##   continent = col_character()
## )
# Print the lookup table: one row per country with its continent.
continents
## # A tibble: 245 x 2
##    country           continent
##    <chr>             <chr>    
##  1 Afghanistan       Asia     
##  2 Åland Islands     Europe   
##  3 Albania           Europe   
##  4 Algeria           Africa   
##  5 American Samoa    Oceania  
##  6 Andorra           Europe   
##  7 Angola            Africa   
##  8 Anguilla          Americas 
##  9 Antigua & Barbuda Americas 
## 10 Argentina         Americas 
## # … with 235 more rows

Data joins

Joining data frames

something_join(x, y)
  • inner_join(): all rows from x where there are matching values in y; if a row has multiple matches, all combinations of the matches are returned
  • left_join(): all rows from x
  • right_join(): all rows from y
  • full_join(): all rows from both x and y
  • anti_join(): all rows from x where there are no matching values in y; rows of x are never duplicated
  • …

For the next bit…

# Print `x`, the first (left-hand) table used in the join examples below.
# NOTE(review): `x` is created outside this section.
x
## # A tibble: 3 x 1
##   value
##   <dbl>
## 1     1
## 2     2
## 3     3
# Print `y`, the second (right-hand) table used in the join examples below.
# NOTE(review): `y` is created outside this section.
y
## # A tibble: 3 x 1
##   value
##   <dbl>
## 1     1
## 2     2
## 3     4

inner_join()

# Rows whose key is present in both tables.
x %>% inner_join(y)
## Joining, by = "value"
## # A tibble: 2 x 1
##   value
##   <dbl>
## 1     1
## 2     2

left_join()

# Every row of x, matched against y where possible.
x %>% left_join(y)
## Joining, by = "value"
## # A tibble: 3 x 1
##   value
##   <dbl>
## 1     1
## 2     2
## 3     3

right_join()

# Every row of y, matched against x where possible.
x %>% right_join(y)
## Joining, by = "value"
## # A tibble: 3 x 1
##   value
##   <dbl>
## 1     1
## 2     2
## 3     4

full_join()

# Every row from either table.
x %>% full_join(y)
## Joining, by = "value"
## # A tibble: 4 x 1
##   value
##   <dbl>
## 1     1
## 2     2
## 3     3
## 4     4

anti_join()

# Rows of x that have no counterpart in y.
x %>% anti_join(y)
## Joining, by = "value"
## # A tibble: 1 x 1
##   value
##   <dbl>
## 1     3

Question: We want to keep all rows and columns from `fisheries` and add a 
column for corresponding continents. Which join function should we use?
# Show only the join key of the fisheries data.
select(fisheries, country)
## # A tibble: 82 x 1
##    country   
##    <chr>     
##  1 Angola    
##  2 Argentina 
##  3 Australia 
##  4 Bangladesh
##  5 Brazil    
##  6 Cambodia  
##  7 Cameroon  
##  8 Canada    
##  9 Chad      
## 10 Chile     
## # … with 72 more rows
# Print the lookup table again for side-by-side comparison with the
# fisheries country column.
continents
## # A tibble: 245 x 2
##    country           continent
##    <chr>             <chr>    
##  1 Afghanistan       Asia     
##  2 Åland Islands     Europe   
##  3 Albania           Europe   
##  4 Algeria           Africa   
##  5 American Samoa    Oceania  
##  6 Andorra           Europe   
##  7 Angola            Africa   
##  8 Anguilla          Americas 
##  9 Antigua & Barbuda Americas 
## 10 Argentina         Americas 
## # … with 235 more rows

Join fisheries and continents

fisheries <- fisheries %>%
  left_join(continents)
## Joining, by = "country"
Question: How does `left_join()` know to join the two data frames by `country`?

Hint:

  • Variables in the original fisheries dataset:
## [1] "country"     "capture"     "aquaculture" "total"
  • Variables in the continents dataset:
## [1] "country"   "continent"

Let’s check to make sure all countries now have a continent assigned.

# List the countries that did not pick up a continent in the join.
filter(fisheries, is.na(continent))
## # A tibble: 3 x 5
##   country                          capture aquaculture   total continent
##   <chr>                              <dbl>       <dbl>   <dbl> <chr>    
## 1 Democratic Republic of the Congo  237372        3161  240533 <NA>     
## 2 Hong Kong                         142775        4258  147033 <NA>     
## 3 Myanmar                          2072390     1017644 3090034 <NA>

Nope!

We will need to manually fix some of these.

# Assign continents by hand to the countries the join left unmatched;
# every other row keeps the continent it already has.
fisheries <- mutate(
  fisheries,
  continent = case_when(
    country == "Democratic Republic of the Congo" ~ "Africa",
    country %in% c("Hong Kong", "Myanmar")        ~ "Asia",
    TRUE                                          ~ continent
  )
)

…and check again

# Re-run the check: an empty result means every country has a continent.
filter(fisheries, is.na(continent))
## # A tibble: 0 x 5
## # … with 5 variables: country <chr>, capture <dbl>, aquaculture <dbl>,
## #   total <dbl>, continent <chr>
Question: What does the following code do?
fisheries <- mutate(
  fisheries,
  aquaculture_perc = aquaculture / total
)

Continent-level statistics

# Per-continent minimum, mean and maximum of the country-level
# aquaculture share.
fisheries_summary <- summarise(
  group_by(fisheries, continent),
  min_ap  = min(aquaculture_perc),
  mean_ap = mean(aquaculture_perc),
  max_ap  = max(aquaculture_perc)
)

fisheries_summary
## # A tibble: 5 x 4
##   continent  min_ap mean_ap max_ap
##   <chr>       <dbl>   <dbl>  <dbl>
## 1 Africa    0        0.0943  0.803
## 2 Americas  0        0.192   0.529
## 3 Asia      0        0.367   0.782
## 4 Europe    0.00682  0.165   0.618
## 5 Oceania   0.0197   0.150   0.357

Visualize continent summary stats

# First pass: one bar per continent, bar height = mean aquaculture share.
fisheries_summary %>%
  ggplot(aes(x = continent, y = mean_ap)) +
  geom_col()

Improve visualization

# Horizontal bars ordered by mean share, with a percent-formatted axis
# and descriptive title, subtitle and caption.
fisheries_summary %>%
  ggplot(aes(x = mean_ap, y = fct_reorder(continent, mean_ap))) +
  geom_col() +
  scale_x_continuous(labels = percent) + #<<
  theme_minimal() +
  labs(
    title = "Average share of aquaculture by continent",
    subtitle = "out of total fisheries harvest, 2016",
    caption = "Source: bit.ly/2VrawTt",
    x = "", y = ""
  )

Mapping

Mapping the fisheries data

  • Obtain country boundaries and store as a data frame
  • Join the fisheries and country boundaries data frames
  • Plot the country boundaries, and fill by fisheries harvest data

The map_data() function easily turns data from the maps package into a data frame suitable for plotting with ggplot2:

# Peek at the first rows of the world boundary data.
map_data("world") %>%
  head()
##        long      lat group order region subregion
## 1 -69.89912 12.45200     1     1  Aruba      <NA>
## 2 -69.89571 12.42300     1     2  Aruba      <NA>
## 3 -69.94219 12.43853     1     3  Aruba      <NA>
## 4 -70.00415 12.50049     1     4  Aruba      <NA>
## 5 -70.06612 12.54697     1     5  Aruba      <NA>
## 6 -70.05088 12.59707     1     6  Aruba      <NA>

A few fixes for better matching

Question: What does the following code do?
world_map <- mutate(
  map_data("world"),
  region = case_when(
    region == "UK"           ~ "United Kingdom",
    region == "USA"          ~ "United States",
    subregion == "Hong Kong" ~ "Hong Kong",
    TRUE                     ~ region
  )
)

Map the world

# Draw every country outline as a plain gray polygon.
world_map %>%
  ggplot(aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "gray") +
  theme_minimal()

Join fisheries and world map

# Attach boundary points to each fisheries row; the key is called
# `country` in fisheries and `region` in world_map.
fisheries_map <- fisheries %>%
  left_join(world_map, by = c("country" = "region"))
glimpse(fisheries_map)
## Rows: 72,685
## Columns: 11
## $ country          <chr> "Angola", "Angola", "Angola", "Angola", "Angola", "A…
## $ capture          <dbl> 486490, 486490, 486490, 486490, 486490, 486490, 4864…
## $ aquaculture      <dbl> 655, 655, 655, 655, 655, 655, 655, 655, 655, 655, 65…
## $ total            <dbl> 487145, 487145, 487145, 487145, 487145, 487145, 4871…
## $ continent        <chr> "Africa", "Africa", "Africa", "Africa", "Africa", "A…
## $ aquaculture_perc <dbl> 0.001344569, 0.001344569, 0.001344569, 0.001344569, …
## $ long             <dbl> 23.96650, 23.98828, 24.01006, 24.02559, 24.04141, 24…
## $ lat              <dbl> -10.87178, -11.00283, -11.18477, -11.31563, -11.3741…
## $ group            <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3…
## $ order            <int> 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 43…
## $ subregion        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …

Mapping fisheries

# Country polygons from the joined data, filled by capture.
ggplot(
  fisheries_map,
  aes(x = long, y = lat, group = group, fill = capture)
) +
  geom_polygon() +
  scale_fill_viridis_c() +
  theme_minimal()

Question: What is misleading about the map above?

Putting it all together

# Two polygon layers: all of world_map in light gray as a backdrop, then
# the countries in fisheries_map drawn on top and filled by capture.
ggplot() +
  geom_polygon(
    data = world_map,
    mapping = aes(x = long, y = lat, group = group),
    fill = "lightgray"
  ) +
  geom_polygon(
    data = fisheries_map,
    mapping = aes(x = long, y = lat, group = group, fill = capture)
  ) +
  scale_fill_viridis_c() +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    title = "Fisheries harvest by capture, 2016",
    subtitle = "Capture measured in tonnes",
    caption = "Source: bit.ly/2VrawTt",
    x = "", y = ""
  )

Log scale

# Same layered map as before, but the fill is the natural log of capture.
ggplot() +
  geom_polygon(
    data = world_map,
    mapping = aes(x = long, y = lat, group = group),
    fill = "lightgray"
  ) +
  geom_polygon(
    data = fisheries_map,
    mapping = aes(x = long, y = lat, group = group, fill = log(capture))
  ) +
  scale_fill_viridis_c() +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    title = "Fisheries harvest by capture, 2016",
    subtitle = "Capture measured in logged tonnes",
    caption = "Source: bit.ly/2VrawTt",
    x = "", y = ""
  )

Aquaculture

# Layered map of aquaculture harvest on a log scale: world_map in light
# gray as a backdrop, fisheries_map on top.
ggplot() +
  geom_polygon(
    data = world_map,
    mapping = aes(x = long, y = lat, group = group),
    fill = "lightgray"
  ) +
  geom_polygon(
    data = fisheries_map,
    # 1 is added before taking the log so that a zero harvest maps to 0
    # instead of -Inf.
    mapping = aes(
      x = long, y = lat, group = group,
      fill = log(aquaculture + 1)
    )
  ) +
  scale_fill_viridis_c() +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    x = "", y = "",
    title = "Fisheries harvest by aquaculture, 2016",
    subtitle = "Aquaculture measured in logged tonnes",
    # The legend names the quantity actually plotted, including the + 1.
    fill = "log(aquaculture + 1)",
    caption = "Source: bit.ly/2VrawTt"
  )

# (Re)compute the aquaculture share on the map data. The joined data
# already has a column of this name; this overwrites it using the same
# formula.
fisheries_map <- mutate(
  fisheries_map,
  aquaculture_perc = aquaculture / total
)

# Layered map of each country's aquaculture share: world_map in light gray
# as a backdrop, fisheries_map on top with a percent-formatted legend.
ggplot() +
  geom_polygon(
    data = world_map,
    mapping = aes(x = long, y = lat, group = group),
    fill = "lightgray"
  ) +
  geom_polygon(
    data = fisheries_map,
    mapping = aes(
      x = long, y = lat, group = group,
      fill = aquaculture_perc
    )
  ) +
  scale_fill_viridis_c(labels = percent_format(accuracy = 1)) +
  theme_minimal() +
  # A wider legend key leaves room for the percent labels.
  theme(legend.position = "bottom", legend.key.width = unit(2, "lines")) +
  labs(
    x = "", y = "",
    # Each country has a single ratio (aquaculture / total), so the value
    # shown is a share, not an average.
    title = "Share of aquaculture by country",
    subtitle = "out of total fisheries harvest, 2016",
    caption = "Source: bit.ly/2VrawTt",
    fill = "Aquaculture %"
  )